Vignette for integrated Seurat/Signac analysis: https://stuartlab.org/signac/articles/pbmc_multiomic.html
PBMC_Multiome_10xTn5_CellRangerARC_fragments.tsv.gzPBMC_Multiome_10xTn5_CellRangerARC/raw_feature_bc_matrix PBMC_Multiome_Tn5hc_50perc_CellRangerARC_fragments.tsv.gz PBMC_Multiome_Tn5hc_50perc_CellRangerARC/raw_feature_bc_matrix PBMC_Multiome_Tn5hc_CellRangerARC_fragments.tsv.gz PBMC_Multiome_Tn5hc_CellRangerARC/raw_feature_bc_matrix RawData_scATAC_*.csv 20230912_PBMC_Multiome_scATAC_UMAP.ipynb RawData_scRNA_*.csv 20230912_PBMC_Multiome_scRNA_UMAP.ipynb 20230912_PBMC_Multiome_Co-embedding.ipynb
### Libraries
library(magrittr)
library(qs)
library(Signac)
library(Seurat)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ggplot2)
library(ggpubr)
### Set seed
set.seed(13)
library(future)
plan("multicore", workers = 20)
options(future.globals.maxSize = 500000 * 1024^2) # for 500 Gb RAM
### Samples
samples <- c("PBMC_scATAC", "PBMC_scTurboATAC_16", "PBMC_scTurboATAC_13")
### Sample palette
sample_palette <- c("grey", "blue", "darkblue")
names(sample_palette) <- samples
### Get gene annotations for hg38
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"
### Data
inputFiles <- c("PBMC_Multiome_10xTn5_CellRangerARC_fragments.tsv.gz",
"PBMC_Multiome_Tn5hc_50perc_CellRangerARC_fragments.tsv.gz",
"PBMC_Multiome_Tn5hc_CellRangerARC_fragments.tsv.gz")
names(inputFiles) <- samples
### Filtered ATAC cells
atac_cells <- list()
for (sample in samples){
atac_cells[[sample]] <- read.table(paste0("RawData_scATAC_",
sample, ".csv"), sep = ",", header = TRUE)
colnames(atac_cells[[sample]])[1] <- "CellID_ArchR"
atac_cells[[sample]]$CellID <- gsub(paste0(sample, "#"), "", atac_cells[[sample]]$CellID_ArchR)
}
### Filtered RNA cells
rna_cells <- list()
for (sample in samples){
rna_cells[[sample]] <- read.table(paste0("RawData_scRNA_",
sample, ".csv"), sep = ",", header = TRUE)
colnames(rna_cells[[sample]])[1] <- "CellID_Seurat"
rna_cells[[sample]]$CellID <- gsub(paste0(sample, "_"), "", rna_cells[[sample]]$CellID_Seurat)
}
input_data <- list()
for (sample in samples){
input_data[[sample]] <- Read10X(data.dir = paste0(sample, "/raw_feature_bc_matrix"))
}
proj_list <- list()
for (sample in samples){
proj_list[[sample]] <- CreateSeuratObject(counts = input_data[[sample]][["Gene Expression"]], project = sample, assay = "RNA")
proj_list[[sample]]$orig.ident <- sample
}
for (sample in samples){
proj_list[[sample]]$percent.mt <- PercentageFeatureSet(proj_list[[sample]], pattern = "^MT-")
}
frag_list <- list()
for (sample in samples){
frag_list[[sample]] <- CreateFragmentObject(inputFiles[sample])
}
atac_list <- list()
for (sample in samples){
proj_list[[sample]][["ATAC"]] <- CreateChromatinAssay(counts = input_data[[sample]][["Peaks"]], fragments = frag_list[[sample]],
genome = 'hg38', sep = c(":", "-"), annotation = annotation)
}
for (sample in samples){
DefaultAssay(proj_list[[sample]]) <- "ATAC"
proj_list[[sample]] <- TSSEnrichment(proj_list[[sample]])
proj_list[[sample]] <- NucleosomeSignal(proj_list[[sample]])
}
proj_list
plot_list <- list()
for (sample in samples){
plot_list[[paste0(sample, "_nCountRNA_nCountATAC")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nCount_ATAC",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") +
scale_x_log10(labels = scales::comma) + scale_y_log10(labels = scales::comma) +
ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)"))
plot_list[[paste0(sample, "_percentMT_TSSenrichment")]] <- FeatureScatter(proj_list[[sample]], feature1 = "percent.mt", feature2 = "TSS.enrichment",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") +
ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)"))
plot_list[[paste0(sample, "_percentMT_NucleosomeRatio")]] <- FeatureScatter(proj_list[[sample]], feature1 = "percent.mt", feature2 = "nucleosome_signal",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") +
ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)"))
}
options(repr.plot.width=18, repr.plot.height=5*length(samples))
ggarrange(plotlist=plot_list, ncol=3, nrow=length(samples))
summary <- lapply(samples,
function(sample){c(sample, '01_raw',
nrow(proj_list[[sample]]@meta.data),
median(proj_list[[sample]]$nFeature_RNA, na.rm = TRUE) %>% round(., 0),
median(proj_list[[sample]]$nCount_RNA, na.rm = TRUE) %>% round(., 0),
median(proj_list[[sample]]$percent.mt, na.rm = TRUE) %>% round(., 2),
median(proj_list[[sample]]$nFeature_ATAC, na.rm = TRUE) %>% round(., 0),
median(proj_list[[sample]]$nCount_ATAC, na.rm = TRUE) %>% round(., 0),
median(proj_list[[sample]]$TSS.enrichment, na.rm = TRUE) %>% round(., 2),
median(proj_list[[sample]]$nucleosome_signal, na.rm = TRUE) %>% round(., 2))}) %>% do.call(rbind, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature_RNA', 'nCount_RNA', 'percent.mt', 'nFeature_ATAC', 'nCount_ATAC', 'TSSenrichment', 'NucleosomeSignal')
summary
HQ_cells <- data.frame(Sample = samples,
HQ_atac_cells = sapply(atac_cells, nrow),
HQ_rna_cells = sapply(rna_cells, nrow),
HQ_cells = sapply(samples, function(sample){sum(atac_cells[[sample]]$CellID %in% rna_cells[[sample]]$CellID)}))
HQ_cells
for (sample in samples){
proj_list[[sample]]$HQ_cell <- rownames(proj_list[[sample]]@meta.data) %in% atac_cells[[sample]]$CellID[which(atac_cells[[sample]]$CellID %in% rna_cells[[sample]]$CellID)]
proj_list[[sample]] <- subset(proj_list[[sample]], subset = HQ_cell)
}
summary <- lapply(samples,
function(sample){c(sample, '02_HQcells',
nrow(proj_list[[sample]]@meta.data),
median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
median(proj_list[[sample]]$percent.mt) %>% round(., 2),
median(proj_list[[sample]]$nFeature_ATAC) %>% round(., 0),
median(proj_list[[sample]]$nCount_ATAC) %>% round(., 0),
median(proj_list[[sample]]$TSS.enrichment) %>% round(., 2),
median(proj_list[[sample]]$nucleosome_signal) %>% round(., 2))}) %>% do.call(rbind, .) %>% rbind(summary, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature_RNA', 'nCount_RNA', 'percent.mt', 'nFeature_ATAC', 'nCount_ATAC', 'TSSenrichment', 'NucleosomeSignal')
summary
plot_list <- list()
for (feature in c("nFeature_ATAC", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal", "nFeature_RNA", "nCount_RNA", "percent.mt")){
plot_list[[feature]] <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x@meta.data[, feature])}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
xlab("") + ylab(feature) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
}
plot_list$nCount_ATAC <- plot_list$nCount_ATAC + scale_y_log10(labels = scales::comma) + ylab("log10(nCount_ATAC)")
plot_list$nCount_RNA <- plot_list$nCount_RNA + scale_y_log10(labels = scales::comma) + ylab("log10(nCount_RNA)")
options(repr.plot.width=15, repr.plot.height=12)
ggarrange(plotlist = plot_list, ncol = 4, nrow = 2)
plot_list <- list()
for (sample in samples){
plot_list[[paste0(sample, "_nCountRNA_nCountATAC")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nCount_ATAC",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") +
scale_x_log10(labels = scales::comma) + scale_y_log10(labels = scales::comma) +
ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)"))
plot_list[[paste0(sample, "_percentMT_TSSenrichment")]] <- FeatureScatter(proj_list[[sample]], feature1 = "percent.mt", feature2 = "TSS.enrichment",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") +
ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)"))
plot_list[[paste0(sample, "_percentMT_NucleosomeRatio")]] <- FeatureScatter(proj_list[[sample]], feature1 = "percent.mt", feature2 = "nucleosome_signal",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") +
ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)"))
}
options(repr.plot.width=18, repr.plot.height=5*length(samples))
ggarrange(plotlist=plot_list, ncol=3, nrow=length(samples))
for (sample in samples){
DefaultAssay(proj_list[[sample]]) <- "ATAC"
proj_list[[sample]] <- FindTopFeatures(proj_list[[sample]], min.cutoff = 5)
proj_list[[sample]] <- RunTFIDF(proj_list[[sample]])
}
for (sample in samples){
DefaultAssay(proj_list[[sample]]) <- "RNA"
proj_list[[sample]] <- SCTransform(proj_list[[sample]])
}
for (sample in samples){
DefaultAssay(proj_list[[sample]]) <- "ATAC"
proj_list[[sample]] <- RunSVD(proj_list[[sample]])
}
plot_list <- list()
for (sample in samples){
plot_list[[sample]] <- ElbowPlot(proj_list[[sample]], reduction = "lsi", ndims = 50)
}
options(repr.plot.width=6*length(samples), repr.plot.height=5)
ggarrange(plotlist = plot_list, ncol = length(samples), nrow = 1)
LSIs_list <- list(25, 25, 25)
names(LSIs_list) <- samples
for (sample in samples){
DefaultAssay(proj_list[[sample]]) <- "SCT"
proj_list[[sample]] <- RunPCA(proj_list[[sample]])
}
plot_list <- list()
for (sample in samples){
plot_list[[sample]] <- ElbowPlot(proj_list[[sample]], reduction = "pca", ndims = 50)
}
options(repr.plot.width=6*length(samples), repr.plot.height=5)
ggarrange(plotlist = plot_list, ncol = length(samples), nrow = 1)
PCs_list <- list(30, 30, 30)
names(PCs_list) <- samples
for (sample in samples){
proj_list[[sample]] <- FindMultiModalNeighbors(proj_list[[sample]], reduction.list = list("pca", "lsi"),
dims.list = list(1:PCs_list[[sample]], 2:LSIs_list[[sample]]),
modality.weight.name = "RNA.weight")
for (resolution in c(0.2, 0.5, 0.8)){
proj_list[[sample]] <- FindClusters(proj_list[[sample]], resolution = resolution, graph.name = "wsnn")
}
}
for (sample in samples){
proj_list[[sample]] <- RunUMAP(proj_list[[sample]], nn.name = "weighted.nn", assay = "RNA")
}
plot_list <- list()
for (sample in samples){
plot_list[[sample]] <- list()
plot_list[[sample]]$clusters0.2 <- DimPlot(proj_list[[sample]], reduction = "umap", group.by = "wsnn_res.0.2")
plot_list[[sample]]$clusters0.5 <- DimPlot(proj_list[[sample]], reduction = "umap", group.by = "wsnn_res.0.5")
plot_list[[sample]]$clusters0.8 <- DimPlot(proj_list[[sample]], reduction = "umap", group.by = "wsnn_res.0.8")
plot_list[[sample]]$nCount_RNA <- FeaturePlot(proj_list[[sample]], reduction = "umap", features = "nCount_RNA")
plot_list[[sample]]$nFeature_RNA <- FeaturePlot(proj_list[[sample]], reduction = "umap", features = "nFeature_RNA")
plot_list[[sample]]$percentMT <- FeaturePlot(proj_list[[sample]], reduction = "umap", features = "percent.mt")
plot_list[[sample]]$nCount_ATAC <- FeaturePlot(proj_list[[sample]], reduction = "umap", features = "nCount_ATAC")
plot_list[[sample]]$nFeature_ATAC <- FeaturePlot(proj_list[[sample]], reduction = "umap", features = "nFeature_ATAC")
plot_list[[sample]]$NucRatio <- FeaturePlot(proj_list[[sample]], reduction = "umap", features = "nucleosome_signal")
plot_list[[sample]]$TSSenrichment <- FeaturePlot(proj_list[[sample]], reduction = "umap", features = "TSS.enrichment")
}
options(repr.plot.width=20, repr.plot.height=24)
for (sample in samples){
ggarrange(plotlist = plot_list[[sample]], ncol = 3, nrow = 4) %>% print(.)
}
### Save Seurat object
qsave(proj_list, 'SeuratObjects.qs')
### Save Sample statistics
write.table(summary, 'Summary.csv', sep=',', col.names = T, row.names = F)
### Save raw data
lapply(samples, function(x){all(rownames(proj_list[[sample]]@meta.data) == rownames(proj_list[[sample]]@reductions$umap@cell.embeddings)) %>% print(.);
df <- cbind(proj_list[[sample]]@meta.data,
proj_list[[sample]]@reductions$umap@cell.embeddings)
write.csv(df, paste0("RawData_", x, ".csv"))})
Isabelle Seufert
Division of Chromatin Networks, DKFZ
12.09.2023
sessionInfo()